异常数据分析

In [1]:
import os
import cv2
import math
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from keras.layers import Input, Lambda
from keras.applications import Xception, xception

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Using TensorFlow backend.

数据读取

In [2]:
data_path_train = '../dogs-vs-cats-dataset/train'
image_names_train = os.listdir(data_path_train)
input_shape = (299, 299, 3)
trains = []
labels = []

# 处理标准的训练数据
for i in tqdm(range(len(image_names_train))):
    image_name = image_names_train[i]
    image_path = os.path.join(data_path_train, image_name)
    image = cv2.imread(image_path)
    image = cv2.resize(image, (input_shape[0], input_shape[1]))
    trains.append(image[:, :, ::-1])
    # cat: 0, dog: 1
    category = 1 if 'dog' in image_name else 0
    labels.append(category)

print('The trains size is:', len(trains))
100%|██████████| 25000/25000 [00:59<00:00, 418.72it/s]
The trains size is: 25000

In [15]:
data_path_test = '../dogs-vs-cats-dataset/test'
image_names_test = os.listdir(data_path_test)
tests = []

# 处理标准的测试数据
for i in tqdm(range(len(image_names_test))):
    image_name = image_names_test[i]
    image_path = os.path.join(data_path_test, image_name)
    image = cv2.imread(image_path)
    if image is None:
        print('Read test image failed:', image_path)
        continue
    image = cv2.resize(image, (input_shape[0], input_shape[1]))
    tests.append(image[:, :, ::-1])
    
print('The tests size is:', len(tests))
100%|██████████| 12500/12500 [00:29<00:00, 417.76it/s]
The tests size is: 12500

预测训练数据

In [3]:
# ImageNet 1000个类 具体内容
# 来源于:https://blog.csdn.net/zhangjunbob/article/details/53258524

dogs = [
 'n02085620','n02085782','n02085936','n02086079'
,'n02086240','n02086646','n02086910','n02087046'
,'n02087394','n02088094','n02088238','n02088364'
,'n02088466','n02088632','n02089078','n02089867'
,'n02089973','n02090379','n02090622','n02090721'
,'n02091032','n02091134','n02091244','n02091467'
,'n02091635','n02091831','n02092002','n02092339'
,'n02093256','n02093428','n02093647','n02093754'
,'n02093859','n02093991','n02094114','n02094258'
,'n02094433','n02095314','n02095570','n02095889'
,'n02096051','n02096177','n02096294','n02096437'
,'n02096585','n02097047','n02097130','n02097209'
,'n02097298','n02097474','n02097658','n02098105'
,'n02098286','n02098413','n02099267','n02099429'
,'n02099601','n02099712','n02099849','n02100236'
,'n02100583','n02100735','n02100877','n02101006'
,'n02101388','n02101556','n02102040','n02102177'
,'n02102318','n02102480','n02102973','n02104029'
,'n02104365','n02105056','n02105162','n02105251'
,'n02105412','n02105505','n02105641','n02105855'
,'n02106030','n02106166','n02106382','n02106550'
,'n02106662','n02107142','n02107312','n02107574'
,'n02107683','n02107908','n02108000','n02108089'
,'n02108422','n02108551','n02108915','n02109047'
,'n02109525','n02109961','n02110063','n02110185'
,'n02110341','n02110627','n02110806','n02110958'
,'n02111129','n02111277','n02111500','n02111889'
,'n02112018','n02112137','n02112350','n02112706'
,'n02113023','n02113186','n02113624','n02113712'
,'n02113799','n02113978']

cats=[
'n02123045','n02123159','n02123394','n02123597'
,'n02124075','n02125311','n02127052']
In [10]:
x = Input(shape=input_shape)
x = Lambda(xception.preprocess_input)(x)
model = Xception(input_tensor=x, input_shape=input_shape, weights='imagenet')

bad_imgs = []
for i in tqdm(range(len(trains))):
    X = np.expand_dims(trains[i], axis=0)
    pred = model.predict(X)
    de_pred = xception.decode_predictions(pred, top=20)[0]
    pred_types = [item[0] for item in de_pred if ((item[0] in dogs) or (item[0] in cats))]
    if (len(pred_types) == 0):
        bad_imgs.append(i)

print(len(bad_imgs), bad_imgs)
    
100%|██████████| 25000/25000 [16:08<00:00, 25.81it/s]
59 [275, 439, 584, 1021, 1239, 1677, 2406, 2668, 2747, 3159, 3363, 3456, 3828, 4037, 5084, 5625, 5661, 6073, 6290, 7487, 7641, 8828, 9419, 9483, 9874, 10286, 10856, 11588, 11666, 12201, 12345, 12834, 12893, 13435, 13519, 14331, 15423, 15550, 15784, 15836, 16015, 16628, 16645, 16881, 17030, 18401, 18513, 19866, 19909, 21115, 21137, 21777, 23019, 23308, 23726, 23974, 24520, 24895, 24976]

展示训练数据集中的异常数据

In [11]:
group = 16

for i in range(math.ceil(len(bad_imgs) / group)):
    plt.figure(figsize=(16, 16))
    for j in range(group):
        index = group * i + j
        if index >= len(bad_imgs):
            break
        img_index = bad_imgs[group * i + j]
        img = cv2.imread(os.path.join(data_path_train, image_names_train[img_index]))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.subplot(4, math.ceil(group / 4), j + 1)
        plt.imshow(img)
        plt.title(image_names_train[img_index])
    plt.show()
    
    

预测测试数据

In [16]:
bad_imgs_test = []
for i in tqdm(range(len(tests))):
    X = np.expand_dims(tests[i], axis=0)
    pred = model.predict(X)
    de_pred = xception.decode_predictions(pred, top=20)[0]
    pred_types = [item[0] for item in de_pred if ((item[0] in dogs) or (item[0] in cats))]
    if (len(pred_types) == 0):
        bad_imgs_test.append(i)

print(len(bad_imgs_test), bad_imgs_test)
100%|██████████| 12500/12500 [08:01<00:00, 25.93it/s]
29 [135, 967, 1124, 1614, 2377, 2585, 3302, 3544, 4031, 4082, 4569, 4652, 4654, 5425, 5738, 5796, 7282, 7889, 7969, 8165, 8252, 8604, 9817, 9838, 9927, 10255, 10464, 11244, 11545]

展示测试数据集中的异常数据

In [19]:
group = 16

for i in range(math.ceil(len(bad_imgs_test) / group)):
    plt.figure(figsize=(16, 16))
    for j in range(group):
        index = group * i + j
        if index >= len(bad_imgs_test):
            break
        img_index = bad_imgs_test[group * i + j]
        img = cv2.imread(os.path.join(data_path_test, image_names_test[img_index]))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.subplot(4, math.ceil(group / 4), j + 1)
        plt.imshow(img)
        plt.title(image_names_test[img_index])
    plt.show()